Предобработка данных

Чтение, слияние, редактирование исходных датасетов

  • Загрузка датасетов с информацией о здоровых людях (info_healthy) и пациентах с СРК (info_ibs)
info_healthy <- read_xlsx ("data/raw/final_health_statistic.xlsx")
info_ibs <- read_xlsx ("data/raw/final_ibs_141_statistic.xlsx")
  • Сравнение переменных в info_healthy и info_ibs
## Уникальные переменные в info_ibs:
## Main_Disease
## Weight_kg
## Height_cm
## BMI
## Diet_type
## Diet_duration
## Additive_usage
## Уникальные переменные в info_healthy:
## Drugs

combined_info

  • Объединение info_healthy и info_ibs в единый датасет combined_info c редактированием содержимого
combined_info <- info_healthy %>% 
  bind_rows(info_ibs) %>%
  mutate (
    BMI_min = ifelse (is.na (BMI_min), round (Weight_min /(Height_max/100 * Height_max/100), 2), BMI_min),
    BMI_max = ifelse (is.na (BMI_max), round (Weight_max /(Height_min/100 * Height_min/100), 2), BMI_max)
    ) %>% 
  unite("BMI_range", BMI_min, BMI_max, sep = "-", na.rm = TRUE) %>%
  unite ("Age_range", Age_min, Age_max, sep = "-", na.rm = TRUE) %>%
  mutate(
    Age_range = case_when(
      Age_range == "18-40" | Age_range == "23-28" | Age_range == "16-42" | Age_range == "21-43" ~ "16-43",
      Age <= 43 ~ "16-43",
      Age > 43 ~ "> 43",
      TRUE ~ NA_character_), #удалена группа 28-54
    research_ID = sub ("research_", "", research_ID),
    research_ID = case_when(
      research_ID == 0 ~ 1,
      research_ID == 1 ~ 2,
      research_ID == 2 ~ 3,
      research_ID == 3 ~ 4,
      research_ID == 4 ~ 5, 
      research_ID == 6 ~ 6, 
      research_ID == 7 ~ 7), 
    patient_ID = row_number(),
    Sex = ifelse (Sex == "mixed", NA, Sex),
    Smoking = sub ("never", "Never",  Smoking),
    Smoking = case_when(
      Smoking == "No" ~ "No",
      Smoking == "Never" ~ "No",
      Smoking == "Rarely (a few times/month)" ~ "Yes", #5 чел.
      Smoking == "Occasionally (1-2 times/week)" ~ "Yes",  #3 чел.
      Smoking == "Regularly (3-5 times/week)" ~ "Yes",  #1 чел.
      Smoking == "Daily" ~ "Yes"), #7 чел.
    Alcohol = sub ("rarely", "Rarely", Alcohol),
    Alcohol = ifelse(Alcohol == "Regularly (3-5 times/week)"|
                       Alcohol == "Daily", #3 чел.
                     "Regularly (3-7 times/week)", Alcohol),
    Antibiotics_usage = case_when(
      Antibiotics_usage == "Month" | Antibiotics_usage == ~ "3 months" |
        Antibiotics_usage == "6 months" ~ "1-6 months",
        #2 чел. больных Month, 0 чел. здоровых 3 months, 0 чел. здоровых для 6 months
      Antibiotics_usage == "Year" | Antibiotics_usage == "Not use"  ~ 
        "12 months/Not use"), # 0 чел. из здоровых для 6-12 months, 0 чел. больных для Not use
    Hygiene = case_when(
      Hygiene == "Occasionally (1-2 times/week) cosmetics" ~ "Occasionally cosmetics (1-2 times/week)",
      Hygiene == "Rarely (a few times/month) cosmetics" ~ "Rarely cosmetics (a few times/month)",
      TRUE ~ Hygiene),
    Hygiene = case_when(
      Hygiene == "Daily cosmetics"|Hygiene == "Regularly cosmetics (3-5 times/week)" ~ "Regularly (3-7 times/week)", #0 чел. больных  для 3-5 times/week
      Hygiene == "Occasionally cosmetics (1-2 times/week)" | Hygiene == "Rarely cosmetics (a few times/month)" ~ "Occasionally (a few-8 times/month)",
      #только 2 чел. больных для 1-2 times/week
      Hygiene == "Never cosmetics" ~ "Never"),
    Physical_activity = sub ("regularly", "Regularly",  Smoking),
    BMI = ifelse (is.na(Weight_kg), BMI, Weight_kg/ (Height_cm/100 * Height_cm/100)),
    BMI_range = ifelse(BMI_range == "", NA, BMI_range),
    BMI_category = case_when(
      BMI_range == "18-25" ~ "normal/overweight",
      BMI_range == "19.21-29.29" ~ "normal/overweight",
      BMI_range == "20.6-29.6" ~ "normal/overweight",
      BMI_range == "21.74-28.38" ~ "normal/overweight",
      BMI_range == "18.5-30.8" ~ "normal/overweight", #немного больше 30
      BMI < 18.5 ~ "underweight",
      BMI >= 18.5 & BMI < 30 ~ "normal/overweight",
      BMI >= 30 ~ "obese")
    ) %>% 
  rename ("Cosmetics" = Hygiene ) %>% 
  
  mutate_if (is.character, as.factor) %>% 
  
  select(-c(
    Instrument, # unique (combined_info$Instrument) = "Illumina MiSeq" 
    Isolation_source, # unique (combined_info$Isolation_source) = "faeces" 
    Assay_type, # unique (combined_info$Assay_type) = "AMPLICON"
    Target_gene, # unique (combined_info$Target_gene) = "16S"
    Main_Disease, # unique (combined_info$Main_Disease) = NA (for healthy), 141 (for ibs)
    Drugs, # unique (combined_info$Drugs) = NA
    Social_status, # unique (combined_info$Social_status) =  NA, urban 
    Weight_kg, Height_cm, Weight_min, Weight_max, Height_min, Height_max, BMI_range, #использованы для создания BMI_category, уменьшения количества NA в BMI
    BMI, # NA у всех здоровых
    Birth_Year, # имеются значения только в тех случаях, где возраст уже известен
    Pets_type # только cat и NA
  ))

rm (info_healthy, info_ibs)
summary (combined_info)
##   research_ID      patient_ID  Seq_region     Seq_date     Health_state
##  Min.   :1.000   Min.   :  1   V3-V4:106   Min.   :2015   Disease:170  
##  1st Qu.:3.000   1st Qu.: 96   V4   :229   1st Qu.:2017   Health :211  
##  Median :4.000   Median :191   V5-V6: 46   Median :2018                
##  Mean   :4.194   Mean   :191               Mean   :2019                
##  3rd Qu.:6.000   3rd Qu.:286               3rd Qu.:2021                
##  Max.   :7.000   Max.   :381               Max.   :2023                
##                                                                        
##       Age        Age_range       Sex         Country  
##  Min.   :19.00   > 43 : 44   female: 79   Spain  :95  
##  1st Qu.:27.25   16-43:242   male  :112   Poland :70  
##  Median :34.00   NA's : 95   NA's  :190   Belgium:46  
##  Mean   :38.71                            Germany:45  
##  3rd Qu.:47.00                            Italy  :37  
##  Max.   :76.00                            Austria:28  
##  NA's   :247                              (Other):60  
##                         Race     Smoking                             Alcohol   
##  African American         :  1   No  :187   Never                        : 12  
##  Asian or Pacific Islander:  1   Yes : 16   Occasionally (1-2 times/week): 24  
##  Caucasian                :116   NA's:178   Rarely (a few times/month)   : 36  
##  Hispanic                 :  1              Regularly (3-7 times/week)   : 15  
##  Other                    : 11              NA's                         :294  
##  NA's                     :251                                                 
##                                                                                
##          Antibiotics_usage Physical_activity  Travel_period
##  1-6 months       : 48     No  :187          3 months: 18  
##  12 months/Not use:129     Yes : 16          6 months: 10  
##  NA's             :204     NA's:178          Month   : 13  
##                                              Year    : 45  
##                                              NA's    :295  
##                                                            
##                                                            
##          Education_level                              Cosmetics  
##  Bachelor's level: 31    Never                             : 44  
##  Master's level  : 28    Occasionally (a few-8 times/month): 17  
##  School Level    : 25    Regularly (3-7 times/week)        : 25  
##  NA's            :297    NA's                              :295  
##                                                                  
##                                                                  
##                                                                  
##    Sleep_duration Diet_type  Diet_duration Additive_usage
##  > 8 hours:  9    10.0: 13   3.0 :  8      22.0:  1      
##  4-6 hours:  9    5.0 :  5   6.0 :  5      23.0:  1      
##  6-7 hours: 31    NA's:363   NA's:368      4.0 :  9      
##  7-8 hours: 38                             NA's:370      
##  NA's     :294                                           
##                                                          
##                                                          
##             BMI_category
##  normal/overweight:287  
##  obese            :  5  
##  underweight      :  1  
##  NA's             : 88  
##                         
##                         
## 
  • Таблица с количеством субъектов (n) в зависимости от исследования, даты, наличия/отсутствия СРК
combined_info %>% 
  select (research_ID, Country, Seq_date, Health_state) %>% 
  group_by(research_ID, Country, Seq_date, Health_state) %>% 
  summarise(n = n()) %>% 
  flextable() %>% theme_box() %>% 
  merge_v(c ("research_ID", "Country", "Seq_date", "Health_state")) %>% 
  align(align = "center", part = "all")

research_ID

Country

Seq_date

Health_state

n

1

Belgium

2,018

Health

46

2

Italy

36

3

Poland

2,023

70

4

Australia

2,021

6

Austria

2,020

3

Germany

33

2,021

12

Greece

1

Hungary

2

Ireland

Disease

1

Israel

Health

1

Italy

1

Norway

2,018

Disease

1

UK

7

2,020

2

2,021

3

USA

2,018

6

2,020

1

2,021

5

2,022

1

2,023

1

5

Austria

2,017

25

6

Israel

2,022

22

7

Spain

2,015

95

tbl_summary of combined_info

library(gtsummary)

combined_info %>% 
  select(! c(patient_ID,
              Diet_duration, Additive_usage, Diet_type #у всех здоровых данные переменные = NA
              )) %>% 
  tbl_summary(digits = list(all_continuous() ~ c(0, 0),
                            all_categorical() ~ c(0, 0)),
              by = Health_state) %>%
  add_p()
Characteristic Disease, N = 1701 Health, N = 2111 p-value2
research_ID

<0.001
    1 0 (0%) 46 (22%)
    2 0 (0%) 36 (17%)
    3 0 (0%) 70 (33%)
    4 28 (16%) 59 (28%)
    5 25 (15%) 0 (0%)
    6 22 (13%) 0 (0%)
    7 95 (56%) 0 (0%)
Seq_region

<0.001
    V3-V4 0 (0%) 106 (50%)
    V4 170 (100%) 59 (28%)
    V5-V6 0 (0%) 46 (22%)
Seq_date

<0.001
    2015 95 (56%) 0 (0%)
    2017 25 (15%) 0 (0%)
    2018 14 (8%) 82 (39%)
    2020 3 (2%) 36 (17%)
    2021 9 (5%) 23 (11%)
    2022 23 (14%) 0 (0%)
    2023 1 (1%) 70 (33%)
Age 42 (32, 50) 28 (26, 37) <0.001
    Unknown 95 152
Age_range

<0.001
    > 43 34 (45%) 10 (5%)
    16-43 41 (55%) 201 (95%)
    Unknown 95 0
Sex

0.15
    female 35 (48%) 44 (37%)
    male 38 (52%) 74 (63%)
    Unknown 97 93
Country


    Australia 0 (0%) 6 (3%)
    Austria 25 (15%) 3 (1%)
    Belgium 0 (0%) 46 (22%)
    Germany 0 (0%) 45 (21%)
    Greece 0 (0%) 1 (0%)
    Hungary 0 (0%) 2 (1%)
    Ireland 1 (1%) 0 (0%)
    Israel 22 (13%) 1 (0%)
    Italy 0 (0%) 37 (18%)
    Norway 1 (1%) 0 (0%)
    Poland 0 (0%) 70 (33%)
    Spain 95 (56%) 0 (0%)
    UK 12 (7%) 0 (0%)
    USA 14 (8%) 0 (0%)
Race

0.4
    African American 0 (0%) 1 (1%)
    Asian or Pacific Islander 0 (0%) 1 (1%)
    Caucasian 27 (100%) 89 (86%)
    Hispanic 0 (0%) 1 (1%)
    Other 0 (0%) 11 (11%)
    Unknown 143 108
Smoking 4 (14%) 12 (7%) 0.2
    Unknown 142 36
Alcohol

0.002
    Never 4 (14%) 8 (14%)
    Occasionally (1-2 times/week) 4 (14%) 20 (34%)
    Rarely (a few times/month) 9 (32%) 27 (46%)
    Regularly (3-7 times/week) 11 (39%) 4 (7%)
    Unknown 142 152
Antibiotics_usage

0.020
    1-6 months 2 (8%) 46 (30%)
    12 months/Not use 23 (92%) 106 (70%)
    Unknown 145 59
Physical_activity 4 (14%) 12 (7%) 0.2
    Unknown 142 36
Travel_period

0.066
    3 months 3 (11%) 15 (26%)
    6 months 1 (4%) 9 (16%)
    Month 4 (14%) 9 (16%)
    Year 20 (71%) 25 (43%)
    Unknown 142 153
Education_level

<0.001
    Bachelor's level 5 (18%) 26 (46%)
    Master's level 18 (64%) 10 (18%)
    School Level 5 (18%) 20 (36%)
    Unknown 142 155
Cosmetics

0.6
    Never 15 (56%) 29 (49%)
    Occasionally (a few-8 times/month) 6 (22%) 11 (19%)
    Regularly (3-7 times/week) 6 (22%) 19 (32%)
    Unknown 143 152
Sleep_duration

0.2
    > 8 hours 4 (14%) 5 (8%)
    4-6 hours 4 (14%) 5 (8%)
    6-7 hours 12 (43%) 19 (32%)
    7-8 hours 8 (29%) 30 (51%)
    Unknown 142 152
BMI_category

0.012
    normal/overweight 135 (96%) 152 (100%)
    obese 5 (4%) 0 (0%)
    underweight 1 (1%) 0 (0%)
    Unknown 29 59
1 n (%); Median (IQR)
2 Pearson’s Chi-squared test; Wilcoxon rank sum test; Fisher’s exact test
  • Загрузка датасетов с данными о микробиоме здоровых людей (bacteria_healthy) и пациентов с СРК (bacteria_ibs) и их объединение в combined_bacteria
bacteria_healthy <- read_csv("data/raw/final_bacteria_health.csv")
bacteria_ibs <- read_csv("data/raw/final_bacteria_ibs_141.csv")

combined_bacteria <- bacteria_healthy %>% 
  bind_rows (bacteria_ibs) %>% 
  mutate(patient_ID = row_number())

rm (bacteria_healthy, bacteria_ibs)
  • Объединение combined_bacteria и combined_info в data_wide
data_wide <- combined_info %>% 
  left_join (combined_bacteria)

Оценка доли NA и нулевых значений

  • Оценка доли NA в переменных data_wide
data_wide %>% 
  select (where (function(x) sum (is.na(x))/ nrow(data_wide) * 100 > 0)) %>% 
  sapply (function(x) sum (is.na(x))/ nrow(data_wide) * 100) %>% round(1) %>% 
  as.data.frame() %>% 
  rename(NA_percentage = ".") %>% 
  mutate (
    "Number of people with known data" = round (nrow(data_wide) - NA_percentage/100 * nrow(data_wide)),
    NA_percentage = paste (NA_percentage, "%", sep = " ")
    ) %>% 
  arrange(desc (NA_percentage)) %>% 
  rownames_to_column() %>% 
  as_tibble() %>% flextable()

rowname

NA_percentage

Number of people with known data

Additive_usage

97.1 %

11

Diet_duration

96.6 %

13

Diet_type

95.3 %

18

Education_level

78 %

84

Travel_period

77.4 %

86

Cosmetics

77.4 %

86

Alcohol

77.2 %

87

Sleep_duration

77.2 %

87

Race

65.9 %

130

Age

64.8 %

134

Antibiotics_usage

53.5 %

177

Sex

49.9 %

191

Smoking

46.7 %

203

Physical_activity

46.7 %

203

Age_range

24.9 %

286

BMI_category

23.1 %

293

  • Рассчитаем процент не NA и не 0 данных по колонкам широкого формата
# Устанавливаем порог процента 
threshold_percent <- 95 
 
# Функция для вычисления процента записей, не равных NA и не равных 0, для каждой колонки 
calculate_percentage <- function(col) { 
  sum(!is.na(col) & col != 0) / length(col) * 100 
} 
 
# Применяем функцию к каждой колонке в датасете 
percentage_non_zero_non_na <- sapply(data_wide[, -1], calculate_percentage) 
 
# Создаем датафрейм с результатами 
result_df_sort <- data.frame( 
  column = names(percentage_non_zero_non_na), 
  percentage = round(100 - percentage_non_zero_non_na, 2) # percentage означает пропущенные или 0 значения
) %>% 
  arrange(desc(percentage))
 
# Отфильтровываем колонки, у которых процент записей менее threshold_percent% 
filtered_columns <- result_df_sort[result_df_sort$percentage < threshold_percent, ]

# Сохраним датасет в excel для дальнейшего анализа
write.xlsx(filtered_columns, 
           file = "data/originals/percentage_by_vars.xlsx")

# Перезапись data_wide с выбором колонок с процентом NA/0 менее threshold_percent 
data_wide <- data_wide %>% 
  select (row.names(filtered_columns), research_ID)

rm (calculate_percentage, result_df_sort)
  • Рассчитаем процент не NA и не 0 данных по форматированному patient_ID
# Устанавливаем порог процента 
threshold_percent <- 95
 
# Рассчитываем процент значений, не являющихся NA и не равных 0, для каждого пациента 
percentage_non_zero_non_na <- rowMeans(!is.na(data_wide) & data_wide != 0, na.rm = TRUE) * 100 
 
# Создаем датафрейм с результатами 
result_df_sort <- data.frame( 
  patient_id = data_wide$patient_ID, 
  percentage = round(100 - percentage_non_zero_non_na, 2) # percentage означает пропущенные или 0 значения
) %>% arrange(desc(percentage))

# Отфильтровываем пациентов, у которых процент значений менее threshold_percent% 
filtered_patients <- result_df_sort[result_df_sort$percentage < threshold_percent, ] 

# Сохраним датасет в excel для дальнейшего анализа 
write.xlsx(filtered_patients,
           file = "data/originals/percentage_by_patient.xlsx") 

# Перезапись data_wide с удалением строк с процентом NA/0 более threshold_percent 
data_wide <- data_wide %>% 
  slice (filtered_patients$patient_id) #при threshold_percent = 95%, изменения data_wide не происходит, так как нет пациентов с процентом NA/0 более 95%

rm (percentage_non_zero_non_na, result_df_sort)
  • Сохранение data_wide после удаления колонок и строк с процентом NA/0 более threshold_percent
data_wide <- data_wide %>% 
  select (patient_ID, any_of (colnames(combined_info)), everything()) %>% 
  arrange(patient_ID)

write_rds(data_wide, 
          file = "data/originals/data_wide.rds")

Поиск “таксонов одного исследования”

Среднее содержание G_таксонов, которые были обнаружены только в одном исследовании

G_only_one <- data_wide %>% 
  select (research_ID, ends_with("_G")) %>%
  add_column(n = 1) %>% #колонка для дальнейшего подсчёта количества субъектов в исследовании
  group_by(research_ID) %>% 
  summarise_each (sum) %>% 
  select (research_ID, n, where (function(x) sum (x!=0) == 1)) %>%  #выбираем таксоны, которые обнаружены только в одном исследовании 
  mutate (across(-c(1:2),
                 function(x) x/n))
## Warning: `summarise_each_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `across()` instead.
## ℹ The deprecated feature was likely used in the dplyr package.
##   Please report the issue at <https://github.com/tidyverse/dplyr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
G_only_one
## # A tibble: 7 × 15
##   research_ID     n CM1G08_G Cladosporium_G Lentimonas_G Micromonospora_G
##         <dbl> <dbl>    <dbl>          <dbl>        <dbl>            <dbl>
## 1           1    46  0              0             0.0500           0.0331
## 2           2    36  0              0.00811       0                0     
## 3           3    70  0              0             0                0     
## 4           4    87  0.00550        0             0                0     
## 5           5    25  0              0             0                0     
## 6           6    22  0              0             0                0     
## 7           7    95  0              0             0                0     
## # ℹ 9 more variables: Pseudosphingobacterium_G <dbl>,
## #   `Schizothrix LEGE 07164_G` <dbl>, Talaromyces_G <dbl>,
## #   `Rs-D38 termite group_G` <dbl>, Kabatiella_G <dbl>,
## #   `Anaerolineaceae UCG-001_G` <dbl>, Iamia_G <dbl>, Thiohalocapsa_G <dbl>,
## #   Ellin517_G <dbl>
  • Определение процента людей, у которых данные таксоны не обнаружены ()
G_taxon <- 'Ellin517_G'
res_ID <- 7

data_wide %>% 
  select (research_ID, G_taxon) %>%
  filter (research_ID == res_ID) %>% 
  summarise(across (-1,
              function (x) sum (x==0)/nrow (.) * 100)) %>% 
  rename ("Taxon_zero_percentage, %" = G_taxon) 
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(G_taxon)
## 
##   # Now:
##   data %>% select(all_of(G_taxon))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 1 × 1
##   `Taxon_zero_percentage, %`
##                        <dbl>
## 1                          0
# CM1G08_G - 73,6% нулей
# Cladosporium_G - 94,0% нулей
# Lentimonas_G - 94,0% нулей
# Micromonospora_G - 93,2% нулей
# Pseudosphingobacterium_G - 93,2% нулей 
# Schizothrix LEGE 07164_G - 92,7% нулей 
# Talaromyces_G - 92,7% нулей 
# Rs-D38 termite group_G - 91,9% нулей 
# Kabatiella_G - 90,6% нулей 
# Anaerolineaceae UCG-001_G - 89,5% нулей 
# Iamia_G - 88,2% нулей 
# Thiohalocapsa_G - 87,9% нулей 
# Ellin517_G - 75,1% нулей 

rm (G_taxon, res_ID)

Среднее содержание F_таксонов, которые были обнаружены только в одном исследовании

data_wide %>% 
  select (research_ID, ends_with("_F")) %>%
  add_column(n = 1) %>% #колонка для дальнейшего подсчёта количества субъектов в исследовании
  group_by(research_ID) %>% 
  summarise_each (sum) %>% 
  select (research_ID, n, where (function(x) sum (x!=0) == 1)) %>%  #выбираем таксоны, которые обнаружены только в одном исследовании 
  mutate (across(-c(1:2),
                 function(x) x/n))
## # A tibble: 7 × 9
##   research_ID     n Didymellaceae_F Cladosporiaceae_F Trichocomaceae_F
##         <dbl> <dbl>           <dbl>             <dbl>            <dbl>
## 1           1    46          0                0                 0     
## 2           2    36          0.0213           0.00811           0.0143
## 3           3    70          0                0                 0     
## 4           4    87          0                0                 0     
## 5           5    25          0                0                 0     
## 6           6    22          0                0                 0     
## 7           7    95          0                0                 0     
## # ℹ 4 more variables: `type III_F` <dbl>, `09D2Z48_F` <dbl>,
## #   Aureobasidiaceae_F <dbl>, Iamiaceae_F <dbl>
  • Определение процента людей, у которых данные таксоны не обнаружены ()
F_taxon <- 'Iamiaceae_F'
res_ID <- 1

data_wide %>% 
  select (research_ID, F_taxon) %>%
  filter (research_ID == res_ID) %>% 
  summarise(across (-1,
              function (x) sum (x==0)/nrow (.) * 100)) %>% 
  rename ("Taxon_zero_percentage, %" = F_taxon) 
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(F_taxon)
## 
##   # Now:
##   data %>% select(all_of(F_taxon))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 1 × 1
##   `Taxon_zero_percentage, %`
##                        <dbl>
## 1                       2.17
# Didymellaceae_F - 44,4% нулей
# Cladosporiaceae_F - 36,1% нулей
# Trichocomaceae_F - 22,2% нулей
# type III_F - 30,4% нулей
# 09D2Z48_F - 23,9% нулей
# Aureobasidiaceae_F - 0% нулей 
# Iamiaceae_F - 2,2% нулей

rm (F_taxon, res_ID)

Среднее содержание O_таксонов, которые были обнаружены только в одном исследовании

data_wide %>% 
  select (research_ID, ends_with("_O")) %>%
  add_column(n = 1) %>% #колонка для дальнейшего подсчёта количества субъектов в исследовании
  group_by(research_ID) %>% 
  summarise_each (sum) %>% 
  select (research_ID, n, where (function(x) sum (x!=0) == 1)) %>%  #выбираем таксоны, которые обнаружены только в одном исследовании 
  mutate (across(-c(1:2),
                 function(x) x/n))
## # A tibble: 7 × 12
##   research_ID     n Hypocreales_O Pleosporales_O `Candidatus Abawacabacteria_O`
##         <dbl> <dbl>         <dbl>          <dbl>                          <dbl>
## 1           1    46        0              0                             0.00892
## 2           2    36        0.0622         0.0367                        0      
## 3           3    70        0              0                             0      
## 4           4    87        0              0                             0      
## 5           5    25        0              0                             0      
## 6           6    22        0              0                             0      
## 7           7    95        0              0                             0      
## # ℹ 7 more variables: `Candidatus Peregrinibacteria_O` <dbl>,
## #   Capnodiales_O <dbl>, `Candidatus Terrybacteria_O` <dbl>,
## #   Eurotiales_O <dbl>, Dothideales_O <dbl>, eub62A3_O <dbl>,
## #   `LD1-PA32_O` <dbl>
  • Определение процента людей, у которых данные таксоны не обнаружены ()
O_taxon <- 'Candidatus Abawacabacteria_O'
res_ID <- 1

data_wide %>% 
  select (research_ID, O_taxon) %>%
  filter (research_ID == res_ID) %>% 
  summarise(across (-1,
              function (x) sum (x==0)/nrow (.) * 100)) %>% 
  rename ("Taxon_zero_percentage, %" = O_taxon) 
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(O_taxon)
## 
##   # Now:
##   data %>% select(all_of(O_taxon))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 1 × 1
##   `Taxon_zero_percentage, %`
##                        <dbl>
## 1                       52.2
# Hypocreales_O - 41,7% нулей
# Pleosporales_O - 41,7% нулей
# Candidatus Abawacabacteria_O - 52,2% нулей
# Candidatus Peregrinibacteria_O - 52,2% нулей
# Capnodiales_O - 33,3% нулей
# Candidatus Terrybacteria_O - 39,1% нулей
# Eurotiales_O - 39,1% нулей
# Dothideales_O - 0% нулей
# eub62A3_O - 15,2% нулей
# LD1-PA32_O - 2,2% нулей

rm (O_taxon, res_ID)

Среднее содержание C_таксонов, которые были обнаружены только в одном исследовании

data_wide %>% 
  select (research_ID, ends_with("_C")) %>%
  add_column(n = 1) %>% #колонка для дальнейшего подсчёта количества субъектов в исследовании
  group_by(research_ID) %>% 
  summarise_each (sum) %>% 
  select (research_ID, n, where (function(x) sum (x!=0) == 1)) %>%  #выбираем таксоны, которые обнаружены только в одном исследовании 
  mutate (across(-c(1:2),
                 function(x) x/n))
## # A tibble: 7 × 5
##   research_ID     n WWE3_C Eurotiomycetes_C Dothideomycetes_C
##         <dbl> <dbl>  <dbl>            <dbl>             <dbl>
## 1           1    46 0                0                  0    
## 2           2    36 0                0.0229             0.102
## 3           3    70 0                0                  0    
## 4           4    87 0.0135           0                  0    
## 5           5    25 0                0                  0    
## 6           6    22 0                0                  0    
## 7           7    95 0                0                  0
  • Определение процента людей, у которых данные таксоны не обнаружены ()
С_taxon <- 'Dothideomycetes_C'
res_ID <- 2

data_wide %>% 
  select (research_ID, С_taxon) %>%
  filter (research_ID == res_ID) %>% 
  summarise(across (-1,
              function (x) sum (x==0)/nrow (.) * 100)) %>% 
  rename ("Taxon_zero_percentage, %" = С_taxon) 
## # A tibble: 1 × 1
##   `Taxon_zero_percentage, %`
##                        <dbl>
## 1                          0
# WWE3_C - 74,4% нулей
# Eurotiomycetes_C - 19,4% нулей 
# Dothideomycetes_C - 0% нулей 

rm (С_taxon, res_ID)

Среднее содержание P_таксонов, которые были обнаружены только в одном исследовании

P_таксонов, которые были обнаружены только в одном исследовании, нет

data_wide финальный

`

data_wide <- data_wide %>% 
  select (!colnames (G_only_one) [- c (1,2)]) #удаление G_таксонов, которые обнаружены только в одном исследовании (все они в этом исследовании встречались редко - до 25% людей) 

bac_functions

  • загрузка и редактирование bac_functions
# Чтение листов Excel-файла с функциями бактерий и их объединение
path <- "data/raw/Bacterial group functions.xlsx"
taxon <- c ("TaxonName", "Rank")

neuromediators <- read_xlsx (path, 2) %>% 
  mutate(Destroy = ifelse(is.na (Destroy), "produce", "destroy")) %>% 
  unique() %>%
  pivot_wider(names_from = Neuromediator, values_from = Destroy)

probiotics <- read_xlsx (path, 3) %>% 
  add_column(probiotics = 1)

special_properties <- read_xlsx (path, 4) %>% 
  add_column(special_properties = 1)

vitamins <- read_xlsx (path, 5) %>% 
  pivot_wider (names_from = Vitamin, values_from = Vitamin,
               values_fn = function(x) ifelse(is.na (x), NA, 1))

habbits <- read_xlsx (path, 7) %>%
  unique() %>% #удаление повторяющихся строк 
  pivot_wider(names_from = Habbit, values_from = Habit_state)

bac_functions <- read_xlsx (path, 1) %>% #Патогены и нежелательные
  full_join(neuromediators, by = taxon) %>% #Нейромедиаторы
  full_join(probiotics, by = taxon) %>% #Пробиотики
  full_join(special_properties, by = taxon) %>% #С особыми свойствами
  full_join(vitamins, by = taxon) %>%  #Витамины
  full_join(read_xlsx (path, 6), by = taxon) %>% #Продуценты КЦДК
  full_join(habbits, by = taxon) %>% #Вредные привычки
  
  unite("Taxon", TaxonName, Rank, sep = "_") %>% 
  filter (Taxon != "Blautia obeum_S") %>% #для данного таксона противоречивая информация в Продуценты КЦЖК
  mutate_all(as.factor)

rm (path, taxon, neuromediators, probiotics, special_properties, vitamins, habbits)

data_long

  • Перевод data_wide в длинный формат (data_long), объединение data_long и bac_functions
# Создание датасета в длинном формате
data_long <- data_wide %>% 
  pivot_longer(ends_with(c("_D", "_P", "_O", "_C", "_F", "_G")),
               names_to = "Taxon", values_to = "Percentage")

#Перезапись data_long с добавлением функций бактерий
data_long <- data_long %>% 
  left_join (bac_functions, by = "Taxon")

#Сохранение data_long.rds
write_rds(data_long, 
          file = "data/originals/data_long.rds",
          compress = "gz") 
  • Создание отдельных длинных датасетов для каждого таксона
G_long <- data_long %>% subset(grepl("_G", Taxon)) 
# %>% select (where (function(x) sum (is.na(x))/ nrow(.) * 100 < threshold_percent))
F_long <- data_long %>% subset(grepl("_F", Taxon))
C_long <- data_long %>% subset(grepl("_C", Taxon))
O_long <- data_long %>% subset(grepl("_O", Taxon))
P_long <- data_long %>% subset(grepl("_P", Taxon))

write_rds(G_long, 
          file = "data/originals/G_long.rds", "gz") 
write_rds(F_long, 
          file = "data/originals/F_long.rds", "gz") 
write_rds(C_long, 
          file = "data/originals/C_long.rds", "gz") 
write_rds(O_long, 
          file = "data/originals/O_long.rds", "gz") 
write_rds(P_long, 
          file = "data/originals/P_long.rds", "gz") 

Анализ

Факторный анализ

  • Проведем множественное сравнение каждого таксона из combined_bacteria с по группе Health_state из combined_info без учета таксономического уровня.
# Создание нового датасета для сохранения результатов
result_dataset <- data.frame(
  Variable_Name = character(),
  Test_Type = character(),
  P_Value = numeric(),
  Normal_Distribution = character(),
  stringsAsFactors = FALSE
)

alpha = 0.05

# Объединяем датасеты по patient_id 
combined_data <- data_wide %>%
  select(Health_state,
    ends_with(c("_D", "_P", "_O", "_C", "_F", "_G")))

combined_bacteria_clean <- combined_data

# Получаем список переменных из датасета combined_bacteria, исключая "patient_ID"
values_to_exclude <- c("patient_ID", "Seq_date", "Age")
variable_names <- setdiff(names(combined_bacteria_clean), values_to_exclude)

# Проходим по каждой переменной
for (variable in variable_names) {
  # Фильтрация данных (исключаем строки с NA и 0 в текущей переменной)
  filtered_data <- combined_data[!is.na(combined_data[[variable]]) & combined_data[[variable]] != 0, ]
  #print(variable)
  #print(filtered_data)

  # Проверим что датасет filtered_data не пустой и что количество групп сравнения более 1, в нашем случае их 2 :)
  if (nrow(filtered_data) > 0 & length(unique(filtered_data$Health_state)) > 1) {
    
  # Check if there is sufficient variability in the data
  if (length(unique(filtered_data$Health_state)) > 1) {
    # Проверка на нормальность
    shapiro_test_result <- try(shapiro.test(filtered_data[[variable]]), silent = TRUE)
    if (inherits(shapiro_test_result, "try-error")) {
      warning(paste("Skipping Shapiro-Wilk test for variable", variable, "due to an error."))
      next
    }
    p_value_shapiro <- shapiro_test_result$p.value

    # Выбор соответствующего статистического теста
    if (p_value_shapiro > 0.05) {
      # Если нормальное распределение, провести дисперсионный анализ
      model <- aov(filtered_data[[variable]] ~ Health_state, data = filtered_data)
      summary_list <- summary(model)
      test_type <- "ANOVA"
    } else {
      # Если не нормальное распределение, использовать тест Краскела-Уоллиса
      tryCatch({
        model <- kruskal.test(filtered_data[[variable]] ~ Health_state, data = filtered_data)
        test_type <- "Kruskal-Wallis"
      }, error = function(e) {
        warning(paste("Skipping Kruskal-Wallis test for variable", variable, "due to an error:", conditionMessage(e)))
        next
      })
    }

    # Добавление результатов выбранного теста в датасет
    result_dataset <- rbind(result_dataset, 
                            data.frame(Variable_Name = variable,
                                       Test_Type = test_type,
                                       P_Value = ifelse(test_type == "ANOVA", summary_list[[1]]$"Pr(>F)"[1], model$p.value),
                                       Normal_Distribution = ifelse(p_value_shapiro > 0.05, "Yes", "No")))
  } else {
    # If there is no variability, skip the tests
    warning(paste("Skipping tests for variable", variable, "as there is not enough variability in the data."))
  }
  }}
## Warning: Skipping Shapiro-Wilk test for variable Health_state due to an error.
# Добавление столбца с поправкой Бонферрони
result_dataset$Adjusted_P_Value_Bonferroni <- p.adjust(result_dataset$P_Value, method = "bonferroni")

# Добавление столбца с поправкой Холма
result_dataset$Adjusted_P_Value_Holm <- p.adjust(result_dataset$P_Value, method = "holm")

# Добавление столбца с поправкой Бенджамини-Хохберга
result_dataset$Adjusted_P_Value_BH <- p.adjust(result_dataset$P_Value, method = "BH")

result_dataset$test_pass = ifelse(result_dataset$Adjusted_P_Value_Bonferroni < alpha & result_dataset$Adjusted_P_Value_Holm < alpha & result_dataset$Adjusted_P_Value_BH < alpha, "Y", "N")

result_dataset_pass <- result_dataset %>% filter(test_pass == "Y")

# Вывод результатов
print(result_dataset_pass)
##                               Variable_Name      Test_Type      P_Value
## 1                               Eukaryota_D Kruskal-Wallis 3.641890e-07
## 2                            Nitrospirota_P Kruskal-Wallis 7.764816e-07
## 3                          Armatimonadota_P Kruskal-Wallis 5.555616e-10
## 4                         Acidobacteriota_P Kruskal-Wallis 3.112329e-09
## 5                         Patescibacteria_P Kruskal-Wallis 1.463219e-15
## 6                           Cyanobacteria_P Kruskal-Wallis 6.542425e-06
## 7                             Chloroflexi_P Kruskal-Wallis 6.316248e-23
## 8                       Verrucomicrobiota_P Kruskal-Wallis 8.425194e-09
## 9                        Actinobacteriota_P Kruskal-Wallis 6.600394e-23
## 10                         Proteobacteria_P Kruskal-Wallis 2.752566e-06
## 11                   Desulfitobacteriales_O Kruskal-Wallis 5.530423e-05
## 12                           Pirellulales_O Kruskal-Wallis 3.196572e-09
## 13                         Anaerolineales_O Kruskal-Wallis 3.746162e-09
## 14                        Xanthomonadales_O Kruskal-Wallis 1.874109e-06
## 15                         Pedosphaerales_O Kruskal-Wallis 1.302641e-15
## 16                      Acholeplasmatales_O Kruskal-Wallis 1.552118e-08
## 17                                SBR1031_O Kruskal-Wallis 3.378675e-19
## 18                     Sphingobacteriales_O Kruskal-Wallis 5.724693e-10
## 19                       Flavobacteriales_O Kruskal-Wallis 8.296231e-08
## 20                       Staphylococcales_O Kruskal-Wallis 2.450391e-21
## 21                           Monoglobales_O Kruskal-Wallis 2.153674e-06
## 22                      Bifidobacteriales_O Kruskal-Wallis 1.699744e-15
## 23                        Chitinophagales_O Kruskal-Wallis 5.210248e-17
## 24                             Bacillales_O Kruskal-Wallis 7.441037e-10
## 25             Clostridia vadinBB60 group_O Kruskal-Wallis 9.747928e-08
## 26                       Coriobacteriales_O Kruskal-Wallis 4.680932e-22
## 27                          Clostridiales_O Kruskal-Wallis 2.333755e-14
## 28                     Erysipelotrichales_O Kruskal-Wallis 7.511336e-10
## 29                        Lactobacillales_O Kruskal-Wallis 9.824748e-11
## 30    Peptostreptococcales-Tissierellales_O Kruskal-Wallis 8.737004e-17
## 31                        Oscillospirales_O Kruskal-Wallis 4.294684e-05
## 32                              vadinHA49_C Kruskal-Wallis 4.885983e-06
## 33                     Desulfitobacteriia_C Kruskal-Wallis 5.530423e-05
## 34                         Microgenomatia_C Kruskal-Wallis 2.221509e-05
## 35                        Dehalococcoidia_C Kruskal-Wallis 1.999580e-05
## 36                          Phycisphaerae_C Kruskal-Wallis 3.006549e-07
## 37                         Planctomycetes_C Kruskal-Wallis 4.398873e-05
## 38                           Rhodothermia_C Kruskal-Wallis 5.134815e-09
## 39                         Cyanobacteriia_C Kruskal-Wallis 2.599878e-06
## 40                           Anaerolineae_C Kruskal-Wallis 1.126492e-19
## 41                       Verrucomicrobiae_C Kruskal-Wallis 2.572550e-09
## 42                         Coriobacteriia_C Kruskal-Wallis 4.373663e-22
## 43                         Actinobacteria_C Kruskal-Wallis 1.748252e-19
## 44                                Bacilli_C Kruskal-Wallis 6.906521e-12
## 45                    Gammaproteobacteria_C Kruskal-Wallis 2.793530e-05
## 46                          Pirellulaceae_F Kruskal-Wallis 3.196572e-09
## 47                        Anaerolineaceae_F Kruskal-Wallis 3.746162e-09
## 48                          Neisseriaceae_F Kruskal-Wallis 1.969734e-05
## 49                        Microscillaceae_F Kruskal-Wallis 3.202746e-09
## 50                        Pedosphaeraceae_F Kruskal-Wallis 1.302641e-15
## 51                                    A4b_F Kruskal-Wallis 5.664201e-18
## 52                     Acholeplasmataceae_F Kruskal-Wallis 1.552118e-08
## 53                                UCG-011_F Kruskal-Wallis 1.038745e-06
## 54                       Selenomonadaceae_F Kruskal-Wallis 3.125086e-06
## 55     [Clostridium] methylpentosum group_F Kruskal-Wallis 2.364971e-18
## 56                      Marinilabiliaceae_F Kruskal-Wallis 2.857166e-19
## 57                      Flavobacteriaceae_F Kruskal-Wallis 1.274939e-05
## 58                      Staphylococcaceae_F Kruskal-Wallis 4.246920e-22
## 59                     Porphyromonadaceae_F Kruskal-Wallis 5.124256e-12
## 60                        Eggerthellaceae_F Kruskal-Wallis 2.117931e-18
## 61                        Barnesiellaceae_F Kruskal-Wallis 2.522551e-07
## 62                          Monoglobaceae_F Kruskal-Wallis 2.153674e-06
## 63                     Bifidobacteriaceae_F Kruskal-Wallis 1.699744e-15
## 64                            Bacillaceae_F Kruskal-Wallis 1.838790e-09
## 65                 Hungateiclostridiaceae_F Kruskal-Wallis 1.127118e-12
## 66                         Marinifilaceae_F Kruskal-Wallis 7.717807e-11
## 67                       Streptococcaceae_F Kruskal-Wallis 6.582098e-14
## 68                  Peptostreptococcaceae_F Kruskal-Wallis 1.042119e-16
## 69              Erysipelatoclostridiaceae_F Kruskal-Wallis 3.073290e-13
## 70                         Clostridiaceae_F Kruskal-Wallis 9.938201e-16
## 71                    Erysipelotrichaceae_F Kruskal-Wallis 1.671406e-05
## 72                         Tannerellaceae_F Kruskal-Wallis 4.258468e-05
## 73  [Eubacterium] coprostanoligenes group_F Kruskal-Wallis 2.001438e-06
## 74                      Butyricicoccaceae_F Kruskal-Wallis 2.897180e-05
## 75                         Muribaculaceae_F Kruskal-Wallis 3.758091e-05
## 76                        Ruminococcaceae_F Kruskal-Wallis 1.458702e-08
## 77                           Halobacillus_G Kruskal-Wallis 4.880366e-05
## 78                              Megamonas_G Kruskal-Wallis 2.480778e-05
## 79                              Hespellia_G Kruskal-Wallis 1.148969e-05
## 80            Lachnospiraceae NK4B4 group_G Kruskal-Wallis 3.406312e-05
## 81                                XBB1006_G Kruskal-Wallis 4.544741e-06
## 82                     Macellibacteroides_G Kruskal-Wallis 8.252963e-10
## 83                           Paludibacter_G Kruskal-Wallis 7.121028e-06
## 84                            Anaerovorax_G Kruskal-Wallis 3.805548e-08
## 85                 Candidatus Phytoplasma_G Kruskal-Wallis 7.466909e-06
## 86                            Selenomonas_G Kruskal-Wallis 1.912782e-05
## 87                Lachnospiraceae UCG-009_G Kruskal-Wallis 9.999884e-06
## 88            [Eubacterium] nodatum group_G Kruskal-Wallis 2.465934e-07
## 89                          Mogibacterium_G Kruskal-Wallis 4.320301e-05
## 90                          Papillibacter_G Kruskal-Wallis 4.897195e-06
## 91                           Anaerococcus_G Kruskal-Wallis 3.272729e-12
## 92                                 CAG-56_G Kruskal-Wallis 9.881041e-07
## 93                         Proteiniphilum_G Kruskal-Wallis 4.464993e-13
## 94                      Ruminiclostridium_G Kruskal-Wallis 1.142188e-05
## 95                        Syntrophococcus_G Kruskal-Wallis 2.488686e-12
## 96                     Pseudobutyrivibrio_G Kruskal-Wallis 6.026604e-06
## 97                          Butyricimonas_G Kruskal-Wallis 1.196336e-05
## 98                      Caproiciproducens_G Kruskal-Wallis 1.550165e-05
## 99                 Prevotellaceae UCG-004_G Kruskal-Wallis 4.891415e-14
## 100                               UBA1819_G Kruskal-Wallis 1.981127e-19
## 101                           Barnesiella_G Kruskal-Wallis 3.908806e-06
## 102           Erysipelotrichaceae UCG-003_G Kruskal-Wallis 6.623150e-10
## 103           Rikenellaceae RC9 gut group_G Kruskal-Wallis 3.864326e-16
## 104                         Porphyromonas_G Kruskal-Wallis 1.759612e-14
## 105                        Staphylococcus_G Kruskal-Wallis 1.471866e-23
## 106                                    A2_G Kruskal-Wallis 3.435145e-21
## 107          Lachnospiraceae NC2004 group_G Kruskal-Wallis 2.010754e-07
## 108                       Bifidobacterium_G Kruskal-Wallis 1.343449e-14
## 109                            Monoglobus_G Kruskal-Wallis 2.153674e-06
## 110         Lachnospiraceae XPB1014 group_G Kruskal-Wallis 8.116506e-06
## 111                            Romboutsia_G Kruskal-Wallis 3.566692e-13
## 112                           Odoribacter_G Kruskal-Wallis 9.910652e-11
## 113                        Acetitomaculum_G Kruskal-Wallis 2.040740e-06
## 114                            Tyzzerella_G Kruskal-Wallis 3.609278e-05
## 115           Clostridium sensu stricto 1_G Kruskal-Wallis 2.878502e-17
## 116                        Marvinbryantia_G Kruskal-Wallis 1.915920e-11
## 117                         NK4A214 group_G Kruskal-Wallis 1.881102e-08
## 118          Lachnospiraceae NK3A20 group_G Kruskal-Wallis 1.638832e-05
## 119                         Streptococcus_G Kruskal-Wallis 6.665449e-13
## 120                      Fusicatenibacter_G Kruskal-Wallis 1.087316e-07
## 121                                 Dorea_G Kruskal-Wallis 1.466599e-05
## 122                          Anaerostipes_G Kruskal-Wallis 2.997463e-10
## 123                       Subdoligranulum_G Kruskal-Wallis 5.234297e-15
## 124                       Parabacteroides_G Kruskal-Wallis 3.893385e-05
## 125          [Ruminococcus] torques group_G Kruskal-Wallis 9.216209e-09
## 126                      Faecalibacterium_G Kruskal-Wallis 7.571240e-10
## 127                     Lachnoclostridium_G Kruskal-Wallis 1.729484e-07
## 128         Lachnospiraceae NK4A136 group_G Kruskal-Wallis 4.135324e-10
## 129                          Agathobacter_G Kruskal-Wallis 4.325760e-06
##     Normal_Distribution Adjusted_P_Value_Bonferroni Adjusted_P_Value_Holm
## 1                    No                3.084681e-04          2.796972e-04
## 2                    No                6.576800e-04          5.955614e-04
## 3                    No                4.705606e-07          4.438937e-07
## 4                    No                2.636142e-06          2.458740e-06
## 5                    No                1.239347e-12          1.204229e-12
## 6                    No                5.541434e-03          4.900276e-03
## 7                    No                5.349862e-20          5.343545e-20
## 8                    No                7.136139e-06          6.596927e-06
## 9                    No                5.590534e-20          5.577333e-20
## 10                   No                2.331423e-03          2.083692e-03
## 11                   No                4.684268e-02          3.981905e-02
## 12                   No                2.707496e-06          2.522095e-06
## 13                   No                3.172999e-06          2.944483e-06
## 14                   No                1.587370e-03          1.431819e-03
## 15                   No                1.103337e-12          1.074679e-12
## 16                   No                1.314644e-05          1.210652e-05
## 17                   No                2.861738e-16          2.821194e-16
## 18                   No                4.848815e-07          4.568305e-07
## 19                   No                7.026907e-05          6.437875e-05
## 20                   No                2.075481e-18          2.060779e-18
## 21                   No                1.824162e-03          1.638946e-03
## 22                   No                1.439683e-12          1.397190e-12
## 23                   No                4.413080e-14          4.324506e-14
## 24                   No                6.302558e-07          5.923065e-07
## 25                   No                8.256495e-05          7.554644e-05
## 26                   No                3.964750e-19          3.941345e-19
## 27                   No                1.976691e-11          1.906678e-11
## 28                   No                6.362102e-07          5.971512e-07
## 29                   No                8.321561e-08          7.889272e-08
## 30                   No                7.400242e-14          7.242976e-14
## 31                   No                3.637598e-02          3.109352e-02
## 32                   No                4.138427e-03          3.674259e-03
## 33                   No                4.684268e-02          3.981905e-02
## 34                   No                1.881618e-02          1.628366e-02
## 35                   No                1.693644e-02          1.467692e-02
## 36                   No                2.546547e-04          2.312036e-04
## 37                   No                3.725845e-02          3.175986e-02
## 38                   No                4.349188e-06          4.025695e-06
## 39                   No                2.202096e-03          1.970707e-03
## 40                   No                9.541384e-17          9.451265e-17
## 41                   No                2.178950e-06          2.034887e-06
## 42                   No                3.704493e-19          3.686998e-19
## 43                   No                1.480770e-16          1.465035e-16
## 44                   No                5.849823e-09          5.566656e-09
## 45                   No                2.366120e-02          2.042070e-02
## 46                   No                2.707496e-06          2.522095e-06
## 47                   No                3.172999e-06          2.944483e-06
## 48                   No                1.668365e-02          1.447755e-02
## 49                   No                2.712726e-06          2.522095e-06
## 50                   No                1.103337e-12          1.074679e-12
## 51                   No                4.797578e-15          4.712615e-15
## 52                   No                1.314644e-05          1.210652e-05
## 53                   No                8.798173e-04          7.946401e-04
## 54                   No                2.646948e-03          2.362565e-03
## 55                   No                2.003131e-15          1.970021e-15
## 56                   No                2.420020e-16          2.388591e-16
## 57                   No                1.079873e-02          9.447299e-03
## 58                   No                3.597141e-19          3.584400e-19
## 59                   No                4.340245e-09          4.135275e-09
## 60                   No                1.793888e-15          1.766355e-15
## 61                   No                2.136601e-04          1.942364e-04
## 62                   No                1.824162e-03          1.638946e-03
## 63                   No                1.439683e-12          1.397190e-12
## 64                   No                1.557456e-06          1.456322e-06
## 65                   No                9.546691e-10          9.129658e-10
## 66                   No                6.536983e-08          6.205117e-08
## 67                   No                5.575037e-11          5.364410e-11
## 68                   No                8.826749e-14          8.628746e-14
## 69                   No                2.603076e-10          2.501658e-10
## 70                   No                8.417656e-13          8.208954e-13
## 71                   No                1.415681e-02          1.231826e-02
## 72                   No                3.606922e-02          3.087389e-02
## 73                   No                1.695218e-03          1.527098e-03
## 74                   No                2.453911e-02          2.114941e-02
## 75                   No                3.183103e-02          2.732132e-02
## 76                   No                1.235521e-05          1.139246e-05
## 77                   No                4.133670e-02          3.518744e-02
## 78                   No                2.101219e-02          1.815930e-02
## 79                   No                9.731768e-03          8.536841e-03
## 80                   No                2.885146e-02          2.483201e-02
## 81                   No                3.849396e-03          3.422190e-03
## 82                   No                6.990259e-07          6.544600e-07
## 83                   No                6.031510e-03          5.326529e-03
## 84                   No                3.223299e-05          2.956911e-05
## 85                   No                6.324472e-03          5.577781e-03
## 86                   No                1.620126e-02          1.407808e-02
## 87                   No                8.469902e-03          7.449913e-03
## 88                   No                2.088646e-04          1.901235e-04
## 89                   No                3.659295e-02          3.123578e-02
## 90                   No                4.147924e-03          3.677793e-03
## 91                   No                2.772002e-09          2.644365e-09
## 92                   No                8.369242e-04          7.568877e-04
## 93                   No                3.781849e-10          3.625574e-10
## 94                   No                9.674336e-03          8.497882e-03
## 95                   No                2.107917e-09          2.013347e-09
## 96                   No                5.104534e-03          4.519953e-03
## 97                   No                1.013297e-02          8.876815e-03
## 98                   No                1.312990e-02          1.145572e-02
## 99                   No                4.143029e-11          3.991395e-11
## 100                  No                1.678015e-16          1.658204e-16
## 101                  No                3.310759e-03          2.951148e-03
## 102                  No                5.609808e-07          5.278651e-07
## 103                  No                3.273084e-13          3.195798e-13
## 104                  No                1.490392e-11          1.439363e-11
## 105                  No                1.246671e-20          1.246671e-20
## 106                  No                2.909568e-18          2.885522e-18
## 107                  No                1.703109e-04          1.552302e-04
## 108                  No                1.137902e-11          1.100285e-11
## 109                  No                1.824162e-03          1.638946e-03
## 110                  No                6.874681e-03          6.054913e-03
## 111                  No                3.020988e-10          2.899721e-10
## 112                  No                8.394322e-08          7.948343e-08
## 113                  No                1.728507e-03          1.555044e-03
## 114                  No                3.057059e-02          2.627555e-02
## 115                  No                2.438091e-14          2.392035e-14
## 116                  No                1.622785e-08          1.542316e-08
## 117                  No                1.593293e-05          1.463497e-05
## 118                  No                1.388090e-02          1.209458e-02
## 119                  No                5.645635e-10          5.405679e-10
## 120                  No                9.209569e-05          8.415828e-05
## 121                  No                1.242210e-02          1.085283e-02
## 122                  No                2.538851e-07          2.400967e-07
## 123                  No                4.433449e-12          4.292123e-12
## 124                  No                3.297697e-02          2.826597e-02
## 125                  No                7.806129e-06          7.207076e-06
## 126                  No                6.412840e-07          6.011565e-07
## 127                  No                1.464873e-04          1.336891e-04
## 128                  No                3.502620e-07          3.308260e-07
## 129                  No                3.663919e-03          3.261623e-03
##     Adjusted_P_Value_BH test_pass
## 1          3.855851e-06         Y
## 2          8.119506e-06         Y
## 3          9.603279e-09         Y
## 4          4.447092e-08         Y
## 5          4.957386e-14         Y
## 6          5.597408e-05         Y
## 7          1.863511e-20         Y
## 8          1.097868e-07         Y
## 9          1.863511e-20         Y
## 10         2.562004e-05         Y
## 11         3.631216e-04         Y
## 12         4.447092e-08         Y
## 13         5.036506e-08         Y
## 14         1.889727e-05         Y
## 15         4.597237e-14         Y
## 16         1.905281e-07         Y
## 17         2.201337e-17         Y
## 18         9.697631e-09         Y
## 19         9.759594e-07         Y
## 20         2.964973e-19         Y
## 21         2.049620e-05         Y
## 22         5.332161e-14         Y
## 23         2.451711e-15         Y
## 24         1.187563e-08         Y
## 25         1.131027e-06         Y
## 26         6.607916e-20         Y
## 27         6.376422e-13         Y
## 28         1.187563e-08         Y
## 29         1.824853e-09         Y
## 30         3.894864e-15         Y
## 31         2.927436e-04         Y
## 32         4.276210e-05         Y
## 33         3.631216e-04         Y
## 34         1.636190e-04         Y
## 35         1.485653e-04         Y
## 36         3.223477e-06         Y
## 37         2.957020e-04         Y
## 38         6.795606e-08         Y
## 39         2.446774e-05         Y
## 40         1.060154e-17         Y
## 41         3.822719e-08         Y
## 42         6.607916e-20         Y
## 43         1.480770e-17         Y
## 44         1.392815e-10         Y
## 45         2.022325e-04         Y
## 46         4.447092e-08         Y
## 47         5.036506e-08         Y
## 48         1.476429e-04         Y
## 49         4.447092e-08         Y
## 50         4.597237e-14         Y
## 51         2.998486e-16         Y
## 52         1.905281e-07         Y
## 53         1.060021e-05         Y
## 54         2.877117e-05         Y
## 55         1.335421e-16         Y
## 56         2.016683e-17         Y
## 57         1.009228e-04         Y
## 58         6.607916e-20         Y
## 59         1.058596e-10         Y
## 60         1.281348e-16         Y
## 61         2.739232e-06         Y
## 62         2.049620e-05         Y
## 63         5.332161e-14         Y
## 64         2.781171e-08         Y
## 65         2.512287e-11         Y
## 66         1.485678e-09         Y
## 67         1.689405e-12         Y
## 68         4.413374e-15         Y
## 69         7.656107e-12         Y
## 70         3.826207e-14         Y
## 71         1.275388e-04         Y
## 72         2.927436e-04         Y
## 73         1.994375e-05         Y
## 74         2.079586e-04         Y
## 75         2.630664e-04         Y
## 76         1.844061e-07         Y
## 77         3.254858e-04         Y
## 78         1.811396e-04         Y
## 79         9.268351e-05         Y
## 80         2.424492e-04         Y
## 81         4.051996e-05         Y
## 82         1.270956e-08         Y
## 83         6.031510e-05         Y
## 84         4.539858e-07         Y
## 85         6.261853e-05         Y
## 86         1.446541e-04         Y
## 87         8.223205e-05         Y
## 88         2.712527e-06         Y
## 89         2.927436e-04         Y
## 90         4.276210e-05         Y
## 91         6.930005e-11         Y
## 92         1.020639e-05         Y
## 93         1.050514e-11         Y
## 94         9.268351e-05         Y
## 95         5.404915e-11         Y
## 96         5.208708e-05         Y
## 97         9.559404e-05         Y
## 98         1.204578e-04         Y
## 99         1.294696e-12         Y
## 100        1.525468e-17         Y
## 101        3.559956e-05         Y
## 102        1.099962e-08         Y
## 103        1.558612e-14         Y
## 104        4.967972e-13         Y
## 105        1.246671e-20         Y
## 106        3.636960e-19         Y
## 107        2.240932e-06         Y
## 108        3.923799e-13         Y
## 109        2.049620e-05         Y
## 110        6.739883e-05         Y
## 111        8.631396e-12         Y
## 112        1.824853e-09         Y
## 113        2.009892e-05         Y
## 114        2.547549e-04         Y
## 115        1.434171e-15         Y
## 116        3.773918e-10         Y
## 117        2.276133e-07         Y
## 118        1.261900e-04         Y
## 119        1.525847e-11         Y
## 120        1.244536e-06         Y
## 121        1.150194e-04         Y
## 122        5.401810e-09         Y
## 123        1.583375e-13         Y
## 124        2.703030e-04         Y
## 125        1.182747e-07         Y
## 126        1.187563e-08         Y
## 127        1.953164e-06         Y
## 128        7.297125e-09         Y
## 129        3.897786e-05         Y
  • Посмотрим на данные здоровых людей и людей с СРК как на многомерный вектор по всем таксонам и группам
combined_bacteria_G <- combined_bacteria_clean %>%
  select(Health_state, ends_with("_G"))

d <- dist(combined_bacteria_G) # euclidean distances between the rows
## Warning in dist(combined_bacteria_G): в результате преобразования созданы NA
fit <- cmdscale(d,eig=TRUE, k=2) # k is the number of dim

df_mds <- data.frame(
  x = fit$points[,1],
  y = fit$points[,2]
  )  

df_full <- cbind(df_mds, combined_info) %>% mutate(Health_state_n = case_when(Health_state == "Health"  ~ 0,
                                                                              Health_state == "Disease" ~ 1))

ggplot(df_full, aes(x = x, y = y, color = Health_state)) +
  geom_point() +
  theme_bw() +
  ggtitle("Распределение вектора таксонов в зависимости от группы пациентов")

  • Используя метод пермутаций проверим отличаются ли группы в зависимомти от Health_state
adonis2(d ~ Health_state_n, data = df_full)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = d ~ Health_state_n, data = df_full)
##                 Df SumOfSqs      R2      F Pr(>F)    
## Health_state_n   1     2519 0.02181 8.4505  0.001 ***
## Residual       379   112971 0.97819                  
## Total          380   115490 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Тест Манна-Уитни для сравнения каждого таксона между больными и здоровыми

  • после округления всех значений до целого
Wilcox_comparison_round_0 <- data_wide %>% 
  select(Health_state, Archaea, Bacteria,
         ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>% 
  mutate(across (where (is.numeric), function (x) round (x,0))) %>% 
  summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>% 
  pivot_longer(everything()) %>% 
  rename (Taxon = name, p_value = value) %>% 
  filter (p_value <= 0.05 ) %>% 
  arrange(p_value) %>% 
  add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>% 
  add_column(p_value_BH = p.adjust(.$p_value, "BH"))

 rbind (
   "Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_0),
 "Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_0 %>% filter (p_value_holm <= 0.05 )),
 "Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_0 %>% filter (p_value_BH <= 0.05 ))
 )
##                                                           [,1]
## Количество значимо различающихся таксонов по p_value       139
## Количество значимо различающихся таксонов по p_value_holm   65
## Количество значимо различающихся таксонов по p_value_BH    139
  • после округления всех значений до десятых
Wilcox_comparison_round_1 <- data_wide %>% 
  select(Health_state, Archaea, Bacteria,
         ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>% 
  mutate(across (where (is.numeric), function (x) round (x,1))) %>% 
  summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>% 
  pivot_longer(everything()) %>% 
  rename (Taxon = name, p_value = value) %>% 
  filter (p_value <= 0.05 ) %>% 
  arrange(p_value) %>% 
  add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>% 
  add_column(p_value_BH = p.adjust(.$p_value, "BH"))

 rbind (
   "Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_1),
 "Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_1 %>% filter (p_value_holm <= 0.05 )),
 "Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_1 %>% filter (p_value_BH <= 0.05 ))
 )
##                                                           [,1]
## Количество значимо различающихся таксонов по p_value       313
## Количество значимо различающихся таксонов по p_value_holm  179
## Количество значимо различающихся таксонов по p_value_BH    313
  • после округления всех значений до сотых
Wilcox_comparison_round_2 <- data_wide %>% 
  select(Health_state, Archaea, Bacteria,
         ends_with(c("_D", "_P", "_O", "_C", "_F", "_G"))) %>% 
  mutate(across (where (is.numeric), function (x) round (x,2))) %>% 
  summarise_if (is.numeric, function (x) (wilcox.test(x ~ .$Health_state)$p.value)) %>% 
  pivot_longer(everything()) %>% 
  rename (Taxon = name, p_value = value) %>% 
  filter (p_value <= 0.05 ) %>% 
  arrange(p_value) %>% 
  add_column(p_value_holm = p.adjust(.$p_value, "holm")) %>% 
  add_column(p_value_BH = p.adjust(.$p_value, "BH"))

 rbind (
   "Количество значимо различающихся таксонов по p_value" = nrow (Wilcox_comparison_round_2),
 "Количество значимо различающихся таксонов по p_value_holm" = nrow (Wilcox_comparison_round_2 %>% filter (p_value_holm <= 0.05 )),
 "Количество значимо различающихся таксонов по p_value_BH" = nrow (Wilcox_comparison_round_2 %>% filter (p_value_BH <= 0.05 ))
 )
##                                                           [,1]
## Количество значимо различающихся таксонов по p_value       633
## Количество значимо различающихся таксонов по p_value_holm  417
## Количество значимо различающихся таксонов по p_value_BH    633

Распределения таксонов G(Genus - Род)

Гистограмма по количеству пациентов (Здоров - СРК)

create_and_plot_taxon_groups <- function(filter_pattern, step_size, dataset_name) {
  
  # Получаем уникальные таксоны, удовлетворяющие условиям
  unique_taxa <- Wilcox_comparison_round_2 %>%
    subset(grepl(filter_pattern, Taxon)) %>%
    filter(p_value_holm < 0.05) %>%
    distinct(Taxon)
  
  # Разбиваем уникальные таксоны на группы по step_size
  taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))
  
  # Проходим по группам и строим графики
  for (i in seq_along(taxon_groups)) {
    current_taxa <- taxon_groups[[i]]
    
    current_dataset <- get(dataset_name)  # Получаем датасет по его имени
    
    G_long_filtered <- current_dataset %>%
      filter(Taxon %in% current_taxa$Taxon & Percentage > 0) %>%
      group_by(Health_state, Taxon) %>%
      summarise(Count = n(), .groups = "drop")
    
    bar_plot <- ggplot(G_long_filtered, aes(x = Health_state, y = Count, fill = Health_state == "Disease")) +
      geom_bar(position = "dodge", color = "black", stat = "identity") +
      scale_fill_manual(values = c("skyblue", "red"), guide = FALSE) +
      labs(title = paste("Гистограмма для таксонов", (i - 1) * step_size + 1, "до", min(i * step_size, nrow(unique_taxa)), "в разрезе Health_state"),
           x = "Статус пациента",
           y = "Количество пациентов, у которых обнаружен данный таксон") +
      coord_flip() +
      facet_wrap(~Taxon, scales = "free_y", ncol = 2, shrink = 0.7) +  # Уменьшение пространства между фасетами
      theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5),  # Поворот текста на оси X и уменьшение шрифта
            axis.text.y = element_text(hjust = 1, size = 5),
            strip.text = element_text(size = 5))  # Уменьшение шрифта для названия фасет
    
    print(bar_plot)  # Отображение графика на экране
    
  }
}

# Пример вызова функции с передачей "_G" в качестве аргумента и названия датасета "G_long"
create_and_plot_taxon_groups("_G", 18, "G_long")
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## ℹ Please use "none" instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

график солнышко

create_and_plot_circular_diagrams <- function(dataset, filter_pattern, step_size, filename) {
  
  # Получаем уникальные таксоны, удовлетворяющие условиям
  unique_taxa <- Wilcox_comparison_round_2 %>%
    subset(grepl(filter_pattern, Taxon)) %>%
    filter(p_value_holm < 0.05) %>%
    distinct(Taxon)
  
  # Разбиваем уникальные таксоны на группы по step_size
  taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))
  
  # Проходим по группам и строим графики
  for (i in seq_along(taxon_groups)) {
    current_taxa <- taxon_groups[[i]]
    
    G_long_filtered <- dataset %>%
      filter(Taxon %in% current_taxa$Taxon & Percentage > 0) %>%
      group_by(Health_state, Taxon) %>%
      summarise(Count = n(), .groups = "drop")
    
    # Добавляем строки с Count = 0 для всех комбинаций Health_state и Taxon
    G_long_filtered <- G_long_filtered %>%
      complete(Health_state, Taxon, fill = list(Count = 0))
    
    data_id <- G_long_filtered %>% filter(G_long_filtered$Health_state == "Disease") %>%
      mutate(id = row_number()) %>% 
      select(Taxon, id)
    
    G_long_filtered <- left_join(G_long_filtered, data_id, by = "Taxon")
    
    labels_data <- G_long_filtered %>% filter(Health_state == "Disease")
    number_of_bars <- nrow(labels_data)
    
    #Вычислим углы для лейбла каждого барра
    labels_data$angel <- 90 - 360 * (labels_data$id-0.5) / number_of_bars
    
    # Добавим горизонтольную регулировку
    labels_data <- labels_data %>%
      mutate(hjust = ifelse(angel < -90, 1, 0))
    
    # Перевернем лейбл в зависимости от "полушария"
    labels_data <- labels_data %>%
      mutate(angel = ifelse(angel < -90, angel + 180, angel))
    
    # Круговая диаграмма для объединенного датасета
    p <- ggplot(data = G_long_filtered, mapping = aes(x = id, y = Count, fill = Health_state)) + 
      geom_bar(stat = "identity", position = "stack") + 
      ylim(-150, 500) +
      
      # Добавляем кастомную тему
      theme_minimal(base_size = 8) +
      theme(axis.text   = element_blank(),
            axis.title  = element_blank(),
            panel.grid  = element_blank(),
            plot.margin = unit(rep(-1, 6), "cm")) +
      coord_polar(start = 0)+
      geom_text(data = labels_data,
                aes(x = id, y = Count,
                    label = Taxon,
                    hjust = hjust),
                color = "black",
                fontface = "bold",
                alpha = 0.9,
                size = 1.5,
                angle = labels_data$angel,
                inherit.aes = FALSE) +
      # Добавляем горизонтальные линии сетки
      geom_hline(yintercept = seq(0, 300, by = 50),
                 linetype = "dotted",
                 color = "gray",
                 size = 0.5) +
      # Добавляем значения рядом с линиями сетки
      annotate("text", x = 1.2, y = seq(0, 300, by = 50),
               label = seq(0, 300, by = 50),
               color = "black",
               size = 3,
               alpha = 40,
               hjust = 0)
    
    p <- p + annotate("text", x = 0.5, y = 490, label = "Сотношение здоровых пациентов и пациентов с СРК",
                      size = 5, color = "black", fontface = "bold", hjust = 0.5)
    
    # Сформируем имя файла для сохранения
    current_filename <- paste0(filename, "_", i, ".png")
    
    # Сохраняем график под уникальным именем
    ggsave(current_filename, p, width = 10, height = 8)
    
    print(p)
  }
}

# Пример вызова функции
create_and_plot_circular_diagrams(G_long, "_G", 40, "output_plot")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Боксплот по возрасту

create_and_plot_boxplots <- function(G_long, filter_pattern, step_size, x_var, title) {
  
  # Получаем уникальные таксоны, удовлетворяющие условиям
  unique_taxa <- Wilcox_comparison_round_2 %>%
    subset(grepl(filter_pattern, Taxon)) %>%
    filter(p_value_holm < 0.05) %>%
    distinct(Taxon)
  
  # Разбиваем уникальные таксоны на группы по step_size
  taxon_groups <- split(unique_taxa, rep(1:ceiling(nrow(unique_taxa) / step_size), each = step_size, length.out = nrow(unique_taxa)))
  
  # Проходим по группам и строим боксплоты
  for (i in seq_along(taxon_groups)) {
    current_taxa <- taxon_groups[[i]]
    
    G_long_filtered <- G_long %>%
      filter(Taxon %in% current_taxa$Taxon & Percentage > 0)
    
    # Создаем формулу для переменной x_var
    formula <- as.formula(paste("Age ~", x_var))
    
    # Строим боксплот
    box_plot <- ggplot(G_long_filtered, aes_string(x = x_var, y = "Age", fill = "Health_state")) +
      geom_boxplot() +
      labs(title = paste(title),
           x = x_var,
           y = "Age") +
      theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8),  # Уменьшаем размер шрифта для подписей Taxon
            axis.title.x = element_blank(),  # Убираем название оси X
            legend.position = "bottom") +  # Перемещаем легенду вниз
      scale_fill_manual(values = c("red", "skyblue")) +
      facet_grid(~Taxon, scales = "free_y", space = "free")  # Используем facet_grid вместо facet_wrap
    
    # Отображаем график на экране
    print(box_plot)
  }
}

# Пример вызова функции с Health_state в качестве x_var
create_and_plot_boxplots(G_long, "_G", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 540 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 590 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 674 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 778 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 643 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 460 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 577 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 554 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 366 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 590 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 445 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 500 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 733 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 555 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 472 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 370 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 384 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 318 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 556 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 377 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 417 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 249 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 634 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 244 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 490 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 533 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 390 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 465 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 236 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 210 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 397 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 268 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 503 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 251 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 353 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 199 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 410 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 429 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 183 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 102 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 414 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 73 rows containing non-finite values (`stat_boxplot()`).

Боксплот по возросту и полу

create_and_plot_boxplots(G_long, "_G", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")
## Warning: Removed 439 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 444 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 364 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 557 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 547 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 580 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 538 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 216 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 398 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 342 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 582 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 175 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 344 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 513 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 423 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 255 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 491 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 653 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 317 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 299 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 342 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 109 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 337 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 284 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 492 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 237 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 235 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 386 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 206 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 305 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 375 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 241 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 460 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 241 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 389 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 323 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 324 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 282 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 214 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 91 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 288 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 335 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 167 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 378 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 232 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 287 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 173 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 111 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 399 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 134 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 327 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 162 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 69 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 369 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 81 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 70 rows containing non-finite values (`stat_boxplot()`).

Боксплот по возросту и статусу курения

create_and_plot_boxplots(G_long, "_G", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")
## Warning: Removed 439 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 444 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 364 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 557 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 547 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 580 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 538 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 216 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 398 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 342 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 582 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 175 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 344 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 513 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 423 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 255 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 491 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 653 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 317 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 299 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 342 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 109 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 337 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 284 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 492 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 237 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 235 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 386 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 206 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 305 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 375 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 241 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 460 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 241 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 389 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 323 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 324 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 282 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 214 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 91 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 288 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 335 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 167 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 378 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 232 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 287 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 173 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 111 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 399 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 134 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 327 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 162 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 69 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 369 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 81 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 70 rows containing non-finite values (`stat_boxplot()`).

Боксплот по возросту и статусу употребления алкоголя

create_and_plot_boxplots(G_long, "_G", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")
## Warning: Removed 439 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 444 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 364 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 557 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 547 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 580 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 538 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 216 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 398 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 342 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 582 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 175 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 344 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 513 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 423 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 255 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 491 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 653 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 317 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 299 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 342 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 109 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 337 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 284 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 492 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 237 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 235 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 386 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 206 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 305 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 375 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 241 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 460 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 241 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 389 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 323 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 324 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 282 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 214 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 91 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 288 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 335 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 167 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 378 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 232 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 287 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 173 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 111 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 399 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 134 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 327 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 162 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 69 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 369 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 81 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 70 rows containing non-finite values (`stat_boxplot()`).

Распределения таксонов F Family(семейство)

Гистограмма по количеству пациентов (Здоров - СРК)

create_and_plot_taxon_groups("_F", 18, "F_long")

Круговая столбчатая диаграмма для отобрадения отношения здоровых пациентов и пациентов с СРК

create_and_plot_circular_diagrams(F_long, "_F", 40, "F_output_plot")

Боксплот по возрасту

create_and_plot_boxplots(F_long, "_F", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")
## Warning: Removed 570 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 721 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 623 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 846 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 729 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 316 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 396 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 451 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 368 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 270 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 257 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 343 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 446 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 300 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 390 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 222 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 330 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 438 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 444 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 225 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 299 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 540 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 607 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 270 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 457 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 260 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 433 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 387 rows containing non-finite values (`stat_boxplot()`).

Боксплот по возросту и полу

create_and_plot_boxplots(F_long, "_F", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")
## Warning: Removed 454 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 565 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 410 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 485 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 601 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 538 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 554 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 198 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 351 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 215 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 332 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 317 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 224 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 240 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 136 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 270 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 411 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 224 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 157 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 344 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 183 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 313 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 243 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 251 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 373 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 145 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 169 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 281 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 324 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 667 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 248 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 178 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 210 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 418 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 202 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 320 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 387 rows containing non-finite values (`stat_boxplot()`).

Боксплот по возросту и статусу курения

create_and_plot_boxplots(F_long, "_F", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")
## Warning: Removed 454 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 565 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 410 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 485 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 601 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 538 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 554 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 198 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 351 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 215 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 332 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 317 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 224 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 240 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 136 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 270 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 411 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 224 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 157 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 344 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 183 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 313 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 243 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 251 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 373 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 145 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 169 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 281 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 324 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 667 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 248 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 178 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 210 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 418 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 202 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 320 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 387 rows containing non-finite values (`stat_boxplot()`).

Боксплот по возросту и статусу употребления алкоголя

create_and_plot_boxplots(F_long, "_F", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")
## Warning: Removed 454 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 565 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 410 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 485 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 601 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 538 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 554 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 198 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 351 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 215 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 332 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 317 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 224 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 240 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 136 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 270 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 411 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 224 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 157 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 344 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 183 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 313 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 243 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 251 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 373 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 145 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 169 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 281 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 324 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 667 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 248 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 178 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 210 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 418 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 202 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 320 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 387 rows containing non-finite values (`stat_boxplot()`).

Распределения таксонов C - Class (класс)

Гистограмма по количеству пациентов (Здоров - СРК)

create_and_plot_taxon_groups("_C", 18, "C_long")

Круговая столбчатая диаграмма для отобрадения отношения здоровых пациентов и пациентов с СРК

create_and_plot_circular_diagrams(C_long, "_C", 40, "F_output_plot")

Боксплот по возрасту

create_and_plot_boxplots(C_long, "_C", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")
## Warning: Removed 822 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 484 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 245 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 390 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 201 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 199 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 144 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 342 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 103 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 510 rows containing non-finite values (`stat_boxplot()`).

Боксплот по возросту и полу

create_and_plot_boxplots(C_long, "_C", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")
## Warning: Removed 588 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 406 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 375 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 182 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 155 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 331 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 146 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 158 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 118 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 306 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 96 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 69 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 460 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 50 rows containing non-finite values (`stat_boxplot()`).

Боксплот по возросту и статусу курения

create_and_plot_boxplots(C_long, "_C", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")
## Warning: Removed 588 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 406 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 375 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 182 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 155 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 331 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 146 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 158 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 118 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 306 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 96 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 69 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 460 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 50 rows containing non-finite values (`stat_boxplot()`).

Боксплот по возросту и статусу употребления алкоголя

create_and_plot_boxplots(C_long, "_C", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")
## Warning: Removed 588 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 406 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 375 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 182 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 155 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 331 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 146 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 158 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 118 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 306 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 96 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 69 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 460 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 50 rows containing non-finite values (`stat_boxplot()`).

Распределения таксонов O - Order (порядок)

Гистограмма по количеству пациентов (Здоров - СРК)

create_and_plot_taxon_groups("_O", 18, "O_long")

Круговая столбчатая диаграмма для отобрадения отношения здоровых пациентов и пациентов с СРК

create_and_plot_circular_diagrams(O_long, "_O", 30, "F_output_plot")

Боксплот по возрасту

create_and_plot_boxplots(O_long, "_O", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")
## Warning: Removed 611 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 747 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 817 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 525 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 628 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 505 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 306 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 365 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 308 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 164 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 581 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 284 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 174 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 200 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 311 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 277 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 95 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 338 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 62 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 582 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 50 rows containing non-finite values (`stat_boxplot()`).

Боксплот по возросту и полу

create_and_plot_boxplots(O_long, "_O", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")
## Warning: Removed 515 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 555 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 535 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 570 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 320 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 513 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 484 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 341 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 261 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 334 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 144 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 240 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 130 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 500 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 168 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 231 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 121 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 195 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 82 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 287 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 255 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 78 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 60 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 317 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 47 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 112 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 535 rows containing non-finite values (`stat_boxplot()`).

Боксплот по возросту и статусу курения

create_and_plot_boxplots(O_long, "_O", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")
## Warning: Removed 515 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 555 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 535 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 570 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 320 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 513 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 484 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 341 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 261 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 334 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 144 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 240 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 130 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 500 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 168 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 231 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 121 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 195 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 82 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 287 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 255 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 78 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 60 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 317 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 47 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 112 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 535 rows containing non-finite values (`stat_boxplot()`).

Боксплот по возросту и статусу употребления алкоголя

create_and_plot_boxplots(O_long, "_O", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")
## Warning: Removed 515 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 555 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 535 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 570 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 320 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 513 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 484 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 341 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 261 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 334 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 144 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 240 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 130 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 500 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 168 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 231 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 121 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 195 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 82 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 287 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 255 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 78 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 60 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 317 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 47 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 112 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 535 rows containing non-finite values (`stat_boxplot()`).

Распределения таксонов P - Phylum (тип)

Гистограмма по количеству пациентов (Здоров - СРК)

create_and_plot_taxon_groups("_P", 18, "P_long")

Круговая столбчатая диаграмма для отобрадения отношения здоровых пациентов и пациентов с СРК

create_and_plot_circular_diagrams(P_long, "_P", 30, "P_output_plot")

Боксплот по возрасту

create_and_plot_boxplots(P_long, "_P", 4, "Health_state", "Боксплоты для таксонов по статусу: Здоров/СРК")
## Warning: Removed 742 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 373 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 510 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 161 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 41 rows containing non-finite values (`stat_boxplot()`).

Боксплот по возросту и полу

create_and_plot_boxplots(P_long, "_P", 3, "interaction(Health_state, Sex)", "Боксплоты для таксона (Пол + статус заболевания)")
## Warning: Removed 660 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 366 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 336 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 263 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 112 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 90 rows containing non-finite values (`stat_boxplot()`).

Боксплот по возросту и статусу курения

create_and_plot_boxplots(P_long, "_P", 3, "interaction(Health_state, Smoking)", "Боксплоты для таксона (Статус курения + статус заболевания)")
## Warning: Removed 660 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 366 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 336 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 263 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 112 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 90 rows containing non-finite values (`stat_boxplot()`).

Боксплот по возросту и статусу употребления алкоголя

create_and_plot_boxplots(P_long, "_P", 3, "interaction(Health_state, Alcohol)", "Боксплоты для таксона (Статус употребления алкоголя + статус заболевания)")
## Warning: Removed 660 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 366 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 336 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 263 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 112 rows containing non-finite values (`stat_boxplot()`).

## Warning: Removed 90 rows containing non-finite values (`stat_boxplot()`).

Посмотрим на данные здоровых людей и людей с СРК как на многомерный вектор по всем таксонам и группам

G - Genus

perform_permutation_test <- function(data, filter_pattern) {
  
  combined_bacteria_G <- data %>%
    select(Health_state, ends_with(filter_pattern))
  
  d <- dist(combined_bacteria_G) # euclidean distances between the rows
  fit <- cmdscale(d, eig=TRUE, k=2) # k is the number of dim
  
  df_mds <- data.frame(
    x = fit$points[,1],
    y = fit$points[,2]
  )  
  
  df_full <- cbind(df_mds, combined_info) %>% 
    mutate(Health_state_n = case_when(Health_state == "Health"  ~ 0,
                                      Health_state == "Disease" ~ 1))
  
  msd_plot <- ggplot(df_full, aes(x = x, y = y, color = Health_state)) +
    geom_point() +
    theme_bw() +
    ggtitle("Распределение вектора таксонов в зависимости от группы пациентов")
  
  print(msd_plot)
  
  print("Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state")
  
  adonis2(d ~ Health_state_n, data = df_full)
}

# Пример вызова функции с использованием другого фильтра
perform_permutation_test(combined_bacteria_clean, "_G")
## Warning in dist(combined_bacteria_G): в результате преобразования созданы NA

## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = d ~ Health_state_n, data = df_full)
##                 Df SumOfSqs      R2      F Pr(>F)    
## Health_state_n   1     2519 0.02181 8.4505  0.001 ***
## Residual       379   112971 0.97819                  
## Total          380   115490 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

F - Family (семейство)

perform_permutation_test(combined_bacteria_clean, "_F")
## Warning in dist(combined_bacteria_G): в результате преобразования созданы NA

## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = d ~ Health_state_n, data = df_full)
##                 Df SumOfSqs     R2      F Pr(>F)    
## Health_state_n   1     4139 0.0227 8.8047  0.001 ***
## Residual       379   178176 0.9773                  
## Total          380   182315 1.0000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

C - Class (класс)

perform_permutation_test(combined_bacteria_clean, "_C")
## Warning in dist(combined_bacteria_G): в результате преобразования созданы NA

## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = d ~ Health_state_n, data = df_full)
##                 Df SumOfSqs      R2      F Pr(>F)  
## Health_state_n   1     3161 0.01019 3.9015  0.018 *
## Residual       379   307043 0.98981                
## Total          380   310204 1.00000                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

O - Order (порядок)

perform_permutation_test(combined_bacteria_clean, "_O")
## Warning in dist(combined_bacteria_G): в результате преобразования созданы NA

## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = d ~ Health_state_n, data = df_full)
##                 Df SumOfSqs      R2      F Pr(>F)   
## Health_state_n   1     3335 0.01602 6.1714  0.002 **
## Residual       379   204808 0.98398                 
## Total          380   208143 1.00000                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

P - Phylum (тип)

perform_permutation_test(combined_bacteria_clean, "_P")
## Warning in dist(combined_bacteria_G): в результате преобразования созданы NA

## [1] "Используя метод пермутаций проверим отличаются ли группы в зависимости от Health_state"
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = d ~ Health_state_n, data = df_full)
##                 Df SumOfSqs      R2      F Pr(>F)  
## Health_state_n   1     3746 0.01138 4.3612  0.014 *
## Residual       379   325550 0.98862                
## Total          380   329297 1.00000                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Кластеризация родов (G-таксонов)

AGNES (agglomerative clustering)

clusters_number <- 2 #количество кластеров

library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
G_scaled <- data_wide %>% 
  column_to_rownames("patient_ID") %>% 
  select (ends_with("_G")) %>% #выбираем только G-таксоны
  scale () #нормируем данные

agnes <- G_scaled %>% 
  dist (method = "euclidean") %>% #cоздаём матрицу дистанций
  as.matrix() %>%  
  agnes( #выбираем Agglomerative clustering (иерархическая кластеризация снизу вверх)
    stand = F, #стандартизируем данныe
    metric = "euclidean", 
    method = "ward")

fviz_dend(agnes, 
          cex = 0.5, k = clusters_number,
          rect = TRUE,
          k_colors = "jco",
          show_labels = FALSE,
          main = "Дендрограмма кластеризации субъектов по количеству G-таксонов",
  ylab = "Высота",)
## Registered S3 method overwritten by 'dendextend':
##   method     from 
##   rev.hclust vegan

  • Сравнение полученных кластеров по основным переменным
data_wide %>% 
  add_column("Cluster" = cutree(agnes, k = clusters_number)) %>% # Создаём новый столбец - вектор принадлежности к кластерам  
  select(2:19, Cluster) %>% 
  tbl_summary(digits = list(all_continuous() ~ c(0, 0),
                            all_categorical() ~ c(0, 0)),
              by = Cluster) %>% 
  add_p()
Characteristic 1, N = 1951 2, N = 1861 p-value2
research_ID

<0.001
    1 46 (24%) 0 (0%)
    2 14 (7%) 22 (12%)
    3 48 (25%) 22 (12%)
    4 37 (19%) 50 (27%)
    5 25 (13%) 0 (0%)
    6 13 (7%) 9 (5%)
    7 12 (6%) 83 (45%)
Seq_region

<0.001
    V3-V4 62 (32%) 44 (24%)
    V4 87 (45%) 142 (76%)
    V5-V6 46 (24%) 0 (0%)
Seq_date

<0.001
    2015 12 (6%) 83 (45%)
    2017 25 (13%) 0 (0%)
    2018 67 (34%) 29 (16%)
    2020 15 (8%) 24 (13%)
    2021 15 (8%) 17 (9%)
    2022 13 (7%) 10 (5%)
    2023 48 (25%) 23 (12%)
Health_state

<0.001
    Disease 59 (30%) 111 (60%)
    Health 136 (70%) 75 (40%)
Age 33 (27, 47) 35 (28, 48) 0.9
    Unknown 120 127
Age_range

0.3
    > 43 25 (14%) 19 (18%)
    16-43 158 (86%) 84 (82%)
    Unknown 12 83
Sex

0.007
    female 57 (49%) 22 (29%)
    male 59 (51%) 53 (71%)
    Unknown 79 111
Country


    Australia 3 (2%) 3 (2%)
    Austria 26 (13%) 2 (1%)
    Belgium 46 (24%) 0 (0%)
    Germany 20 (10%) 25 (13%)
    Greece 1 (1%) 0 (0%)
    Hungary 2 (1%) 0 (0%)
    Ireland 0 (0%) 1 (1%)
    Israel 14 (7%) 9 (5%)
    Italy 14 (7%) 23 (12%)
    Norway 0 (0%) 1 (1%)
    Poland 48 (25%) 22 (12%)
    Spain 12 (6%) 83 (45%)
    UK 4 (2%) 8 (4%)
    USA 5 (3%) 9 (5%)
Race

0.089
    African American 1 (1%) 0 (0%)
    Asian or Pacific Islander 0 (0%) 1 (2%)
    Caucasian 76 (93%) 40 (83%)
    Hispanic 1 (1%) 0 (0%)
    Other 4 (5%) 7 (15%)
    Unknown 113 138
Smoking 9 (7%) 7 (10%) 0.5
    Unknown 64 114
Alcohol

0.3
    Never 4 (11%) 8 (16%)
    Occasionally (1-2 times/week) 13 (35%) 11 (22%)
    Rarely (a few times/month) 16 (43%) 20 (40%)
    Regularly (3-7 times/week) 4 (11%) 11 (22%)
    Unknown 158 136
Antibiotics_usage

<0.001
    1-6 months 48 (41%) 0 (0%)
    12 months/Not use 68 (59%) 61 (100%)
    Unknown 79 125
Physical_activity 9 (7%) 7 (10%) 0.5
    Unknown 64 114
Travel_period

0.4
    3 months 9 (25%) 9 (18%)
    6 months 2 (6%) 8 (16%)
    Month 7 (19%) 6 (12%)
    Year 18 (50%) 27 (54%)
    Unknown 159 136
Education_level

0.8
    Bachelor's level 13 (38%) 18 (36%)
    Master's level 10 (29%) 18 (36%)
    School Level 11 (32%) 14 (28%)
    Unknown 161 136
Cosmetics

0.2
    Never 18 (49%) 26 (53%)
    Occasionally (a few-8 times/month) 5 (14%) 12 (24%)
    Regularly (3-7 times/week) 14 (38%) 11 (22%)
    Unknown 158 137
Sleep_duration

0.5
    > 8 hours 4 (11%) 5 (10%)
    4-6 hours 5 (14%) 4 (8%)
    6-7 hours 15 (41%) 16 (32%)
    7-8 hours 13 (35%) 25 (50%)
    Unknown 158 136
BMI_category

0.8
    normal/overweight 139 (98%) 148 (98%)
    obese 3 (2%) 2 (1%)
    underweight 0 (0%) 1 (1%)
    Unknown 53 35
1 n (%); Median (IQR)
2 Pearson’s Chi-squared test; Wilcoxon rank sum test; Fisher’s exact test

kMeans

G_scaled %>% 
  kmeans(centers = clusters_number, # Кол-во центроидов-кластеров
         iter.max = 20, # Максимальное кол-во итераций
         nstart = 35) %>% # Кол-во центройдов в начале
  fviz_cluster(data = G_scaled,
               palette = c("#2E9FDF", "#E7B800"),
               ellipse = FALSE,
               show.clust.cent = FALSE,
               geom = "point",
               main = "Кластеризация субъектов по G-таксонам")

Определение оптимального количества кластеров

  • методом локтя
G_scaled %>% 
fviz_nbclust(hcut, method = "wss") 

  • с помощью коэффициента силуэта
G_scaled %>% 
  fviz_nbclust(hcut, method = "silhouette") 

DBSCAN

  • определением оптимального размера радиуса (eps)
dbscan::kNNdistplot(G_scaled,
                    minPts = 20)  # мин. количество субъектов в кластере
abline(h=37, col = "red", lty = 2)

G_scaled %>% 
dbscan::dbscan(eps = 37, # радиус
       minPts = 20) %>%  # мин. количество субъектов в кластере
  fviz_cluster(
    data = G_scaled, 
    stand = TRUE,
    ellipse = FALSE, 
    show.clust.cent = FALSE,
    geom = "point", 
    palette = "jco", 
    ggtheme = theme_classic())